{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "_uuid": "c1fdd129c1cbab68ae3e6bf2062575f01f80b87c"
   },
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the TMDB 5000 CSV files: the movie metadata and the matching credits table.\n",
    "df = pd.read_csv('./Resources/tmdb_5000_movies.csv')\n",
    "df_credit = pd.read_csv('./Resources/tmdb_5000_credits.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "_uuid": "c87bda9d56a936be126d03eda0bc743ee35be461"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>budget</th>\n",
       "      <th>genres</th>\n",
       "      <th>homepage</th>\n",
       "      <th>id</th>\n",
       "      <th>keywords</th>\n",
       "      <th>original_language</th>\n",
       "      <th>original_title</th>\n",
       "      <th>overview</th>\n",
       "      <th>popularity</th>\n",
       "      <th>production_companies</th>\n",
       "      <th>...</th>\n",
       "      <th>runtime</th>\n",
       "      <th>spoken_languages</th>\n",
       "      <th>status</th>\n",
       "      <th>tagline</th>\n",
       "      <th>title</th>\n",
       "      <th>vote_average</th>\n",
       "      <th>vote_count</th>\n",
       "      <th>tittle</th>\n",
       "      <th>cast</th>\n",
       "      <th>crew</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>237000000</td>\n",
       "      <td>[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...</td>\n",
       "      <td>http://www.avatarmovie.com/</td>\n",
       "      <td>19995</td>\n",
       "      <td>[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...</td>\n",
       "      <td>en</td>\n",
       "      <td>Avatar</td>\n",
       "      <td>In the 22nd century, a paraplegic Marine is di...</td>\n",
       "      <td>150.437577</td>\n",
       "      <td>[{\"name\": \"Ingenious Film Partners\", \"id\": 289...</td>\n",
       "      <td>...</td>\n",
       "      <td>162.0</td>\n",
       "      <td>[{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...</td>\n",
       "      <td>Released</td>\n",
       "      <td>Enter the World of Pandora.</td>\n",
       "      <td>Avatar</td>\n",
       "      <td>7.2</td>\n",
       "      <td>11800</td>\n",
       "      <td>Avatar</td>\n",
       "      <td>[{\"cast_id\": 242, \"character\": \"Jake Sully\", \"...</td>\n",
       "      <td>[{\"credit_id\": \"52fe48009251416c750aca23\", \"de...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>300000000</td>\n",
       "      <td>[{\"id\": 12, \"name\": \"Adventure\"}, {\"id\": 14, \"...</td>\n",
       "      <td>http://disney.go.com/disneypictures/pirates/</td>\n",
       "      <td>285</td>\n",
       "      <td>[{\"id\": 270, \"name\": \"ocean\"}, {\"id\": 726, \"na...</td>\n",
       "      <td>en</td>\n",
       "      <td>Pirates of the Caribbean: At World's End</td>\n",
       "      <td>Captain Barbossa, long believed to be dead, ha...</td>\n",
       "      <td>139.082615</td>\n",
       "      <td>[{\"name\": \"Walt Disney Pictures\", \"id\": 2}, {\"...</td>\n",
       "      <td>...</td>\n",
       "      <td>169.0</td>\n",
       "      <td>[{\"iso_639_1\": \"en\", \"name\": \"English\"}]</td>\n",
       "      <td>Released</td>\n",
       "      <td>At the end of the world, the adventure begins.</td>\n",
       "      <td>Pirates of the Caribbean: At World's End</td>\n",
       "      <td>6.9</td>\n",
       "      <td>4500</td>\n",
       "      <td>Pirates of the Caribbean: At World's End</td>\n",
       "      <td>[{\"cast_id\": 4, \"character\": \"Captain Jack Spa...</td>\n",
       "      <td>[{\"credit_id\": \"52fe4232c3a36847f800b579\", \"de...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>245000000</td>\n",
       "      <td>[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...</td>\n",
       "      <td>http://www.sonypictures.com/movies/spectre/</td>\n",
       "      <td>206647</td>\n",
       "      <td>[{\"id\": 470, \"name\": \"spy\"}, {\"id\": 818, \"name...</td>\n",
       "      <td>en</td>\n",
       "      <td>Spectre</td>\n",
       "      <td>A cryptic message from Bond’s past sends him o...</td>\n",
       "      <td>107.376788</td>\n",
       "      <td>[{\"name\": \"Columbia Pictures\", \"id\": 5}, {\"nam...</td>\n",
       "      <td>...</td>\n",
       "      <td>148.0</td>\n",
       "      <td>[{\"iso_639_1\": \"fr\", \"name\": \"Fran\\u00e7ais\"},...</td>\n",
       "      <td>Released</td>\n",
       "      <td>A Plan No One Escapes</td>\n",
       "      <td>Spectre</td>\n",
       "      <td>6.3</td>\n",
       "      <td>4466</td>\n",
       "      <td>Spectre</td>\n",
       "      <td>[{\"cast_id\": 1, \"character\": \"James Bond\", \"cr...</td>\n",
       "      <td>[{\"credit_id\": \"54805967c3a36829b5002c41\", \"de...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>250000000</td>\n",
       "      <td>[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 80, \"nam...</td>\n",
       "      <td>http://www.thedarkknightrises.com/</td>\n",
       "      <td>49026</td>\n",
       "      <td>[{\"id\": 849, \"name\": \"dc comics\"}, {\"id\": 853,...</td>\n",
       "      <td>en</td>\n",
       "      <td>The Dark Knight Rises</td>\n",
       "      <td>Following the death of District Attorney Harve...</td>\n",
       "      <td>112.312950</td>\n",
       "      <td>[{\"name\": \"Legendary Pictures\", \"id\": 923}, {\"...</td>\n",
       "      <td>...</td>\n",
       "      <td>165.0</td>\n",
       "      <td>[{\"iso_639_1\": \"en\", \"name\": \"English\"}]</td>\n",
       "      <td>Released</td>\n",
       "      <td>The Legend Ends</td>\n",
       "      <td>The Dark Knight Rises</td>\n",
       "      <td>7.6</td>\n",
       "      <td>9106</td>\n",
       "      <td>The Dark Knight Rises</td>\n",
       "      <td>[{\"cast_id\": 2, \"character\": \"Bruce Wayne / Ba...</td>\n",
       "      <td>[{\"credit_id\": \"52fe4781c3a36847f81398c3\", \"de...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>260000000</td>\n",
       "      <td>[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...</td>\n",
       "      <td>http://movies.disney.com/john-carter</td>\n",
       "      <td>49529</td>\n",
       "      <td>[{\"id\": 818, \"name\": \"based on novel\"}, {\"id\":...</td>\n",
       "      <td>en</td>\n",
       "      <td>John Carter</td>\n",
       "      <td>John Carter is a war-weary, former military ca...</td>\n",
       "      <td>43.926995</td>\n",
       "      <td>[{\"name\": \"Walt Disney Pictures\", \"id\": 2}]</td>\n",
       "      <td>...</td>\n",
       "      <td>132.0</td>\n",
       "      <td>[{\"iso_639_1\": \"en\", \"name\": \"English\"}]</td>\n",
       "      <td>Released</td>\n",
       "      <td>Lost in our world, found in another.</td>\n",
       "      <td>John Carter</td>\n",
       "      <td>6.1</td>\n",
       "      <td>2124</td>\n",
       "      <td>John Carter</td>\n",
       "      <td>[{\"cast_id\": 5, \"character\": \"John Carter\", \"c...</td>\n",
       "      <td>[{\"credit_id\": \"52fe479ac3a36847f813eaa3\", \"de...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 23 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      budget                                             genres  \\\n",
       "0  237000000  [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...   \n",
       "1  300000000  [{\"id\": 12, \"name\": \"Adventure\"}, {\"id\": 14, \"...   \n",
       "2  245000000  [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...   \n",
       "3  250000000  [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 80, \"nam...   \n",
       "4  260000000  [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...   \n",
       "\n",
       "                                       homepage      id  \\\n",
       "0                   http://www.avatarmovie.com/   19995   \n",
       "1  http://disney.go.com/disneypictures/pirates/     285   \n",
       "2   http://www.sonypictures.com/movies/spectre/  206647   \n",
       "3            http://www.thedarkknightrises.com/   49026   \n",
       "4          http://movies.disney.com/john-carter   49529   \n",
       "\n",
       "                                            keywords original_language  \\\n",
       "0  [{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...                en   \n",
       "1  [{\"id\": 270, \"name\": \"ocean\"}, {\"id\": 726, \"na...                en   \n",
       "2  [{\"id\": 470, \"name\": \"spy\"}, {\"id\": 818, \"name...                en   \n",
       "3  [{\"id\": 849, \"name\": \"dc comics\"}, {\"id\": 853,...                en   \n",
       "4  [{\"id\": 818, \"name\": \"based on novel\"}, {\"id\":...                en   \n",
       "\n",
       "                             original_title  \\\n",
       "0                                    Avatar   \n",
       "1  Pirates of the Caribbean: At World's End   \n",
       "2                                   Spectre   \n",
       "3                     The Dark Knight Rises   \n",
       "4                               John Carter   \n",
       "\n",
       "                                            overview  popularity  \\\n",
       "0  In the 22nd century, a paraplegic Marine is di...  150.437577   \n",
       "1  Captain Barbossa, long believed to be dead, ha...  139.082615   \n",
       "2  A cryptic message from Bond’s past sends him o...  107.376788   \n",
       "3  Following the death of District Attorney Harve...  112.312950   \n",
       "4  John Carter is a war-weary, former military ca...   43.926995   \n",
       "\n",
       "                                production_companies  ... runtime  \\\n",
       "0  [{\"name\": \"Ingenious Film Partners\", \"id\": 289...  ...   162.0   \n",
       "1  [{\"name\": \"Walt Disney Pictures\", \"id\": 2}, {\"...  ...   169.0   \n",
       "2  [{\"name\": \"Columbia Pictures\", \"id\": 5}, {\"nam...  ...   148.0   \n",
       "3  [{\"name\": \"Legendary Pictures\", \"id\": 923}, {\"...  ...   165.0   \n",
       "4        [{\"name\": \"Walt Disney Pictures\", \"id\": 2}]  ...   132.0   \n",
       "\n",
       "                                    spoken_languages    status  \\\n",
       "0  [{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...  Released   \n",
       "1           [{\"iso_639_1\": \"en\", \"name\": \"English\"}]  Released   \n",
       "2  [{\"iso_639_1\": \"fr\", \"name\": \"Fran\\u00e7ais\"},...  Released   \n",
       "3           [{\"iso_639_1\": \"en\", \"name\": \"English\"}]  Released   \n",
       "4           [{\"iso_639_1\": \"en\", \"name\": \"English\"}]  Released   \n",
       "\n",
       "                                          tagline  \\\n",
       "0                     Enter the World of Pandora.   \n",
       "1  At the end of the world, the adventure begins.   \n",
       "2                           A Plan No One Escapes   \n",
       "3                                 The Legend Ends   \n",
       "4            Lost in our world, found in another.   \n",
       "\n",
       "                                      title vote_average vote_count  \\\n",
       "0                                    Avatar          7.2      11800   \n",
       "1  Pirates of the Caribbean: At World's End          6.9       4500   \n",
       "2                                   Spectre          6.3       4466   \n",
       "3                     The Dark Knight Rises          7.6       9106   \n",
       "4                               John Carter          6.1       2124   \n",
       "\n",
       "                                     tittle  \\\n",
       "0                                    Avatar   \n",
       "1  Pirates of the Caribbean: At World's End   \n",
       "2                                   Spectre   \n",
       "3                     The Dark Knight Rises   \n",
       "4                               John Carter   \n",
       "\n",
       "                                                cast  \\\n",
       "0  [{\"cast_id\": 242, \"character\": \"Jake Sully\", \"...   \n",
       "1  [{\"cast_id\": 4, \"character\": \"Captain Jack Spa...   \n",
       "2  [{\"cast_id\": 1, \"character\": \"James Bond\", \"cr...   \n",
       "3  [{\"cast_id\": 2, \"character\": \"Bruce Wayne / Ba...   \n",
       "4  [{\"cast_id\": 5, \"character\": \"John Carter\", \"c...   \n",
       "\n",
       "                                                crew  \n",
       "0  [{\"credit_id\": \"52fe48009251416c750aca23\", \"de...  \n",
       "1  [{\"credit_id\": \"52fe4232c3a36847f800b579\", \"de...  \n",
       "2  [{\"credit_id\": \"54805967c3a36829b5002c41\", \"de...  \n",
       "3  [{\"credit_id\": \"52fe4781c3a36847f81398c3\", \"de...  \n",
       "4  [{\"credit_id\": \"52fe479ac3a36847f813eaa3\", \"de...  \n",
       "\n",
       "[5 rows x 23 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rename the credits columns so the join key is called 'id', as in the movies table.\n",
    "# The 'tittle' spelling keeps this column distinct from the movies table's own 'title'.\n",
    "df_credit.columns = ['id', 'tittle', 'cast', 'crew']\n",
    "# Left join: every movie row is kept. Re-running this cell merges the credits a second time.\n",
    "df = df.merge(right=df_credit, how='left', on='id')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_uuid": "ee603279675033fc397f0c94738e20b34f35312b"
   },
   "source": [
    "# **Demographic Filtering** -\n",
    "   Before getting started with this  -\n",
    "* we need a metric to score or rate movie \n",
    "* Calculate the score for every movie \n",
    "* Sort the scores and recommend the best rated movie to the users.\n",
    "\n",
    "We could use a movie's average rating as its score, but that would not be fair: a movie with an 8.9 average rating from only 3 votes cannot be considered better than a movie with a 7.8 average rating from 40 votes.\n",
    "So, I'll be using IMDB's weighted rating (wr) which is given as :-\n",
    "\n",
    "![](https://image.ibb.co/jYWZp9/wr.png)\n",
    "where,\n",
    "* v is the number of votes for the movie;\n",
    "* m is the minimum votes required to be listed in the chart;\n",
    "* R is the average rating of the movie; And\n",
    "* C is the mean vote across the whole report\n",
    "\n",
    "We already have v (**vote_count**) and R (**vote_average**); C and m are computed in the next cell:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "_uuid": "5799b99c5e5ed5b7723ae8b31e1fc9fb1e7b89ec"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(481, 24)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# IMDB weighted rating: wr = v/(v+m) * R + m/(v+m) * C, where\n",
    "#   v = number of votes for the movie\n",
    "#   m = minimum number of votes required to be listed in the chart\n",
    "#   R = average rating of the movie\n",
    "#   C = mean vote across the whole dataset\n",
    "C = df['vote_average'].mean()\n",
    "# m is the 90th percentile of vote counts, so roughly the most-voted 10% of movies qualify.\n",
    "m = df['vote_count'].quantile(0.9)\n",
    "# Filter first, then copy: only the qualifying rows are duplicated, and adding the\n",
    "# 'score' column below cannot touch df.\n",
    "q_movies = df.loc[df['vote_count'] >= m].copy()\n",
    "\n",
    "def weighted_rating(x, m=m, C=C):\n",
    "    \"\"\"Return the IMDB weighted rating for one movie row or for a whole DataFrame.\n",
    "\n",
    "    x must provide 'vote_count' and 'vote_average'. Only element-wise arithmetic\n",
    "    is used, so x may be a single row (Series) or a DataFrame of movies.\n",
    "    m and C default to the values computed above when this function is defined.\n",
    "    \"\"\"\n",
    "    v = x['vote_count']\n",
    "    R = x['vote_average']\n",
    "    # Calculation based on the IMDB formula\n",
    "    return (v/(v+m) * R) + (m/(m+v) * C)\n",
    "\n",
    "# Score every qualifying movie in one vectorized call instead of a row-wise apply\n",
    "q_movies['score'] = weighted_rating(q_movies)\n",
    "# Sort movies by score, best first\n",
    "q_movies = q_movies.sort_values('score', ascending=False)\n",
    "q_movies.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "_uuid": "207f7058f92698b5fd776f7771a3ac0cc2928bf1"
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAywAAAEWCAYAAACE8BN/AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3debhdVX3/8feHBIUwBQmCECRarRoGA0RFBAtqW1ERVKwgVeBnobSCOP0K1glaB6RWKmChFBWFMAgiUurPAQRFJkkIEAStFlGZVBBQBhHw+/vjrCuHy703N8lNzr7m/Xqe+2TvtdZZa+3DeQ75ZO21b6oKSZIkSeqiVQY9AUmSJEkajYFFkiRJUmcZWCRJkiR1loFFkiRJUmcZWCRJkiR1loFFkiRJUmcZWCRJGibJjkluHvQ8lkSS7yXZcdDzkKSJZmCRJHVakpuSPJDk3iQ/T3JSkjUHPa8lkeSwJJXk4GHlB7fyw5Z1jKrarKouWtZ+JKlrDCySpMlgl6paE9gamAu8b8DzGVWSqaNU/Q/w5mFle7dySdIoDCySpEmjqm4B/h+wOUCSjZKcm+RXSX6UZL+htm1V46wkZyT5TZKrkjy3r76SPKPv/KQkHxpp3CSHJvnf1s/1SV7TV7dPkkuSHJXkTuCwUaZ/JTAtyWbtdZsBq7Xy/rH2a9fyq3ZtG7Xy45J8fFjbLyd5Zzu+KcnL2vEqfXO+M8kXkjyp1a2W5JRWfneSK5NsMNb7LkmDZGCRJE0aSTYBXgEsbEWnAzcDGwG7Ax9J8pK+l+wKnAk8CTgVOCfJqksx9P8COwDrAIcDpyR5Sl/9C4AbgQ2AD4/Rz8k8usqydzv/gzb3jwJ/BTwF+Am9awQ4DXhDkrS26wJ/0Vff7yBgN+DP6L03dwGf6ht3HWATYD3gAOCBMeYsSQNlYJEkTQbnJLkb+A7wLXrBZBPgRcAhVfXbqroaOJHH3na1oKrOqqqHgE/QW9HYdkkHr6ozq+rWqvp9VZ0B/BB4fl+TW6vqmKp6uKrG+sv/KcCeLTTt0c777QV8pqquqqoHgfcAL0wyC7gYKHrBCXoB7bKqunWEcQ4A3ltVN7d+DgN2b7erPUQvqDyjqh6pqgVV9evxvheStKIZWCRJk8FuVTW9qjatqr9voWAj4FdV9Zu+dj8BNu47/9nQQVX9nkdXY5ZIkjcnubrdQnU3vVvSZow0zliq6qfAj4CPAD+squGv26hdw1D7e4E7gY2rquitpuzZqt8IzBtlqE2BL/XN9wbgEXorQCcDXwNOT3JrkiOXctVJklYIA4skabK6FXhSkrX6yp4K3NJ3vsnQQZJVgJntdQD3A9P62m440iBJNgX+EzgQWK+qpgPXAelrVksw788D72p/DncrvbAxNPYa9FZDhq7pNHorJZvSuw3ti6OM8TNg5xbyhn5Wq6pbquqhqjq8qmYD2wGv4vEPA5CkzjCwSJImpbY6cSnw0baRfEvgLTz2Nqttkry23Qr1duBB4PJWdzXwxiRTkryc3n6PkaxBL5D8EiDJvrRN/0vpDHp7T74wQt1pwL5J5iR5Ir2VmCuq6iaAqloI3EHv1revVdXdo4xxPPDhFmxIsn6SXdvxTkm2SDIF+DW9W8R+vwzXI0nLlYFFkjSZ7QnMorcy8SXgg1V1fl/9l4E30Nt0/ibgtW0/C8DBwC7A3fT2jpwz0gBVdT3wr8BlwM+BLYBLlnbCVfVAVZ0/0l6XNvf301s5uQ34E3p7XfqdCrys/TmaTwLnAl9P8ht6Ie0FrW5D4Cx6YeUGenuCTh6pE0nqgvRuiZUk6Y9L+2WMz6iqvx70XCRJS88VFkmSJEmdZWCRJEmS1FneEiZJkiSps1xhkSRJktRZUwc9AXXbjBkzatasWYOehiRJkv6ILViw4I6qWn+kOgOLxjRr1izmz58/6GlIkiTpj1iSn4xW5y1hkiRJkjrLwCJJ
kiSpswwskiRJkjrLwCJJkiSpswwskiRJkjrLwCJJkiSpswwskiRJkjrLwCJJkiSps/zFkRrT7fc/zBEL7xj0NCRJkrQcHbrVjEFPYVSusEiSJEnqLAOLJEmSpM4ysEiSJEnqLAOLJEmSpM4ysEiSJEnqLAOLJEmSpM4ysEiSJEnqLAOLJEmSpM4ysEiSJEnqLAOLJEmSpM4ysKxASSrJKX3nU5P8Msl57fzVSQ5dTB8bJTlrec9VkiRJ6oKpg57ASuY+YPMkq1fVA8CfA7cMVVbVucC5Y3VQVbcCuy/XWUqSJEkd4QrLivcV4JXteE/gtKGKJPskObYdn5Tk6CSXJrkxye6tfFaS69rxakk+m2RRkoVJdurr5+wkX03ywyRHtvIprd/r2mvesQKvW5IkSVpirrCseKcDH2i3gW0JfAbYYZS2TwG2B55Nb+Vl+K1gbwWqqrZI8mzg60n+tNXNAbYCHgR+kOQY4MnAxlW1OUCS6SMNmmR/YH+A6RvOXKqLlCRJkiaCKywrWFVdC8yit7rylcU0P6eqfl9V1wMbjFC/PXBK6/f7wE+AocByQVXdU1W/Ba4HNgVuBJ6e5JgkLwd+PcocT6iquVU1d41111uyC5QkSZImkIFlMM4FPk7f7WCjeLDvOEs4Rv9rHwGmVtVdwHOBi4ADgBOXsE9JkiRphTKwDMZngMOratEy9nMxsBdAuxXsqcAPRmucZAawSlV9EXgfsPUyji9JkiQtV+5hGYCquhk4egK6+nfguCSLgIeBfarqwWTUxZiNgc8mGQqq75mAOUiSJEnLTapq0HNQh82cPacOnHf+oKchSZKk5ejQrWYMdPwkC6pq7kh13hImSZIkqbMMLJIkSZI6y8AiSZIkqbMMLJIkSZI6y8AiSZIkqbMMLJIkSZI6y8AiSZIkqbMMLJIkSZI6y990rzFtOG3qwH+RkCRJklZerrBIkiRJ6iwDiyRJkqTOMrBIkiRJ6iwDiyRJkqTOMrBIkiRJ6iyfEqYx3X7/wxyx8I5BT0OSJE0SPl1UE80VFkmSJEmdZWCRJEmS1FkGFkmSJEmdZWCRJEmS1FkGFkmSJEmdZWCRJEmS1FkGFkmSJEmdZWCRJEmS1FkGFkmSJEmdZWCRJEmS1FkGliWQ5N5xtHl7kmkTNN6OSbYbR7t9khzbjg9L8u6JGF+SJEkaNAPLxHs7sESBJcmUUap2BBYbWJZFkqnLs39JkiRpWRhYlkJb+bgoyVlJvp9kXnreBmwEXJjkwtb2L5JcluSqJGcmWbOV35TkY0muAl6f5G1Jrk9ybZLTk8wCDgDekeTqJDskWT/JF5Nc2X5etJh57tfaXdNeN62Vn5Tk+CRXAEcuv3dKkiRJWjb+6/rS2wrYDLgVuAR4UVUdneSdwE5VdUeSGcD7gJdV1X1JDgHeCfxT6+POqtoaIMmtwNOq6sEk06vq7iTHA/dW1cdbm1OBo6rqO0meCnwNeM4Yczy7qv6zvfZDwFuAY1rdTGC7qnpk+IuS7A/sDzB9w5lL+fZIkiRJy87AsvS+W1U3AyS5GpgFfGdYm22B2cAlSQCeAFzWV39G3/G1wLwk5wDnjDLmy4DZrS+AtYdWbEaxeQsq04E16QWcIWeOFFYAquoE4ASAmbPn1Bj9S5IkScuVgWXpPdh3/Agjv5cBvlFVe47Sx319x68EXgzsArw3yRYjtF8F2LaqfvuYQR4NMMOdBOxWVdck2YfenpiRxpYkSZI6yT0sE+83wFrt+HLgRUmeAZBkjSR/OvwFSVYBNqmqC4FDgHXorYj09wXwdeCgvtfNWcxc1gJuS7IqsNfSXY4kSZI0OAaWiXcC8NUkF1bVL4F9gNOSXEvvdrBnj/CaKcApSRYBC4Gjq+pu4L+A1wxtugfeBsxtG/Ovp7cpfyzvB66gt8fm+xNwbZIkSdIKlSq3KGh0M2fPqQPnnT/oaUiSpEni0K1mDHoKmoSSLKiquSPVucIiSZIkqbMMLJIkSZI6y8Ai
SZIkqbMMLJIkSZI6y8AiSZIkqbMMLJIkSZI6y8AiSZIkqbMMLJIkSZI6a+qgJ6Bu23DaVH8BlCRJkgbGFRZJkiRJnWVgkSRJktRZBhZJkiRJnWVgkSRJktRZBhZJkiRJneVTwjSm2+9/mCMW3jHoaUiSJohPfpQ02bjCIkmSJKmzDCySJEmSOsvAIkmSJKmzDCySJEmSOsvAIkmSJKmzDCySJEmSOsvAIkmSJKmzDCySJEmSOsvAIkmSJKmzDCySJEmSOsvAMkGSPJLk6iTfS3JNknclmZD3N8lhSd49EX319blPkmMnsk9JkiRpok0d9AT+iDxQVXMAkjwZOBVYG/jgQGclSZIkTWKusCwHVfULYH/gwPRMSfIvSa5Mcm2SvwVIsmaSC5JclWRRkl2H+kjy3iT/k+Q7wLP6yi9K8sm2mnNdkue38iclOaf1f3mSLccqlyRJkiYDV1iWk6q6MckU4MnArsA9VfW8JE8ELknydeBnwGuq6tdJZgCXJzkX2BrYA5hD77/RVcCCvu6nVdWcJC8GPgNsDhwOLKyq3ZK8BPh8e/1o5aNKsj+9wMX0DWdOyPshSZIkLQ0Dy4rxF8CWSXZv5+sAzwRuBj7SgsfvgY2BDYAdgC9V1f0ALcT0Ow2gqr6dZO0k04Htgde18m8mWS/J2mOUj6qqTgBOAJg5e04t26VLkiRJS8/AspwkeTrwCPALIMBBVfW1YW32AdYHtqmqh5LcBKw2ju6HhwhDhSRJkv4ouYdlOUiyPnA8cGxVFfA14O+SrNrq/zTJGvRWWn7RwspOwKati28DuyVZPclawC7DhnhD62d7erea3QNcDOzVyncE7qiqX49RLkmSJHWeKywTZ/UkVwOrAg8DJwOfaHUnArOAq5IE+CWwGzAP+K8ki4D5wPcBquqqJGcA19Bbobly2Fi/TbKwjfV/WtlhwGeSXAvcD+y9mHJJkiSp89JbANBkkeQi4N1VNX9FjDdz9pw6cN75K2IoSdIKcOhWMwY9BUl6nCQLqmruSHXeEiZJkiSps7wlbJKpqh0HPQdJkiRpRXGFRZIkSVJnGVgkSZIkdZaBRZIkSVJnGVgkSZIkdZaBRZIkSVJnGVgkSZIkdZaPNdaYNpw21V8yJkmSpIFxhUWSJElSZxlYJEmSJHWWgUWSJElSZxlYJEmSJHWWgUWSJElSZ/mUMI3p9vsf5oiFdwx6GpK00vDJjJL0WK6wSJIkSeosA4skSZKkzjKwSJIkSeosA4skSZKkzjKwSJIkSeosA4skSZKkzjKwSJIkSeosA4skSZKkzjKwSJIkSeosA4skSZKkzlpsYEmyQZJTk9yYZEGSy5K8ZnlMJsmsJNe147lJjl4e4yxmDusnuSLJwiQ7DKt7e5Jpfef3LuNYz09yUZIfJrkqyX8n2WIxr9knybHLMq4kSZI0WUwdqzJJgHOAz1XVG1vZpsCrJ2LwJFOq6pGR6qpqPjB/IsZZQi8FFlXV34xQ93bgFOD+ZR0kyQbAF4A3VtWlrWx74E+ARcvavyRJkvTHYHErLC8BfldVxw8VVNVPquoYePy/9ic5L8mO7fi4JPOTfC/J4X1tbkrysSRXAa9Psk2Sa5JcA7y1r92OSc5rx89vKzsLk1ya5Fl945+d5KttleLIVj4lyUlJrkuyKMk7hl9YW835ZpJrk1yQ5KlJ5gBHArsmuTrJ6n3t3wZsBFyY5MK+8g+3+V/eQsjQKs0Xk1zZfl40wnt7IL0geGnfe/udqjqn9bFL30rP+UN9D7uGEdsk+WSSD7Tjv0zy7STrJPlxklVb+dr955IkSVIXLS6wbAZctZR9v7eq5gJbAn+WZMu+ujurauuqOh34LHBQVT13jL6+D+xQVVsBHwA+0lc3B3gDsAXwhiSbtLKNq2rzqtqijTHcMfQCw5bAPODoqrq69X9GVc2pqgeGGlfV0cCtwE5VtVMrXgO4vM3928B+rfyTwFFV9TzgdcCJI4y/uPf2O8C27ZpPB/5h
Cdq8p70XOwFHA/tW1T3ARcArW5s9gLOr6qHhnSbZv4XN+ffddecYU5QkSZKWrzFvCRsuyaeA7emtujxvMc3/Ksn+bYynALOBa1vdGa2/6cD0qvp2Kz8Z2HmEvtYBPpfkmUAB/asCF7S/jJPkemBT4HvA05McA/w38PUR+nwh8Nq+cY9czPWM5HfAee14AfDn7fhlwOzeHXUArJ1kzaoadc9LkiuAtYGvV9XBwEzgjCRPAZ4A/HiEl43YpqruT7IfvRD1jqr639b+RHqh5hxgXx4NWI9RVScAJwDMnD2nxnwHJEmSpOVocSss3wO2HjqpqrfS2+Oxfit6eFgfqwEkeRrwbuClbQXjv4fqmvuWcJ7/DFxYVZsDuwzr68G+40eAqVV1F/BceisKBzDyCsdEeKiqhv5C/wiPBsBV6K18zGk/G48QVoa/ty8A3k8vnEFvBejYtkL0tzz2mhlHmy2AO+ndxjY0xiXArHbb3pSqum5JL1iSJElakRYXWL4JrJbk7/rKpvUd3wTMSbJKuxXr+a18bXqh5J62r2KkVROq6m7g7rbZHGCvUeaxDnBLO95nMXMmyQxglar6IvA++oJBn0vp3RY1NO7Fi+sX+A2w1jjafR04qG8+c0Zo8ylgnyTb9ZX1v7f917z3KOOM2KY9GOFdwFbAzkle0PeazwOnMvJtcpIkSVKnjBlY2urBbvT2oPw4yXeBzwGHtCaX0LsN6Xp6eyWuaq+7BlhIb+/Jqa3daPYFPpXkaiCjtDkS+GiShYzvNraNgYtan6fQ29Mx3EHAvkmuBd4EHDyOfk8Avtq/6X4UbwPmtg3919Nb5XmMqrqd3t6bjyb5UZJLgd2BoYcYHAacmWQBcMco4zyuTXuy26eBd1fVrcBbgBOTDK2+zAPWBU4bx/VKkiRJA5VH72jSyiDJ7sCuVfWm8bSfOXtOHTjv/OU8K0nSkEO3mjHoKUjSCpdkQXtg1+Ms0aZ7TW7tIQQ7A68Y9FwkSZKk8TCwrESq6qDFt5IkSZK6Y3Gb7iVJkiRpYAwskiRJkjrLwCJJkiSpswwskiRJkjrLwCJJkiSpswwskiRJkjrLxxprTBtOm+ovMZMkSdLAuMIiSZIkqbMMLJIkSZI6y8AiSZIkqbMMLJIkSZI6y8AiSZIkqbMMLJIkSZI6y8caa0y33/8wRyy8Y9DTkKTlzke4S1I3ucIiSZIkqbMMLJIkSZI6y8AiSZIkqbMMLJIkSZI6y8AiSZIkqbMMLJIkSZI6y8AiSZIkqbMMLJIkSZI6y8AiSZIkqbNWeGBJUklO6TufmuSXSc5bwn4uSjJ3lPKfJklf2TlJ7l22mY86hx8kubr97L4cxjgsyS2t/+uT7DmBfU/4eyJJkiRNpEGssNwHbJ5k9Xb+58AtEzzG3cCLAJJMB54ywf3326uq5rSfs8b7oiRTl2CMo6pqDrAr8B9JVl3iWUqSJEmT0KBuCfsK8Mp2vCdw2lBFkucnuSzJwiSXJnlWK189yelJbkjyJWD1x3f7B6cDe7Tj1wJn9/W/ZpILklyVZFGSXVv585Jcm2S1JGsk+V6SzZf0wpLMSnJd3/m7kxzWji9K8m9J5gPvTfLjofCRZO3+85FU1Q+B+4F122vmJLm8zftLSYbK90tyZZJrknwxybRW/rT23i5K8qElvTZJkiRpRRtUYDkd2CPJasCWwBV9dd8HdqiqrYAPAB9p5X8H3F9VzwE+CGwzRv8XAC9OMoVecDmjr+63wGuqamtgJ+Bfk6SqrgTOBT4EHAmcUlXXASS5eoyx5vXdErbeOK79CVU1t6oOBy7i0eC2B3B2VT002guTbA38sKp+0Yo+DxxSVVsCi+i9L7R+nldVzwVuAN7Syj8JHFdVWwC3jTHO/knmJ5l/3113juOSJEmSpOVjIIGlqq4FZtFbXfnKsOp1gDPbKsVRwGat/MXAKX2vv3aMIR4BvkMvBKxeVTf11QX4SJJrgfOBjYENWt0/0btFbS690DI03zlj
jNV/S9h4/nbfH55OBPZtx/sCnx3lNe9I8j16we7DAEnWAaZX1bdam8/Re4+gd8vdxUkWAXvx6Hv4Ih5dzTp5tAlW1QktVM1dY93xZDBJkiRp+RjkU8LOBT5O3+1gzT8DF1bV5sAuwGpL2f/pwNHAF4aV7wWsD2zTgsjP+8ZYD1gTWGsZxn2Yx76vw/u5b+igqi4BZiXZEZgytKIzgqOqajPgdcCn28rUWE4CDmwrKYcPm0Mt9gokSZKkjhhkYPkMcHhVLRpWvg6PbsLfp6/828AbAdreki0X0//FwEd5fCBaB/hFVT2UZCdg0766/wDeD8wDPja+y3icnwNPTrJekicCr1pM+88DpzL66sofVNW5wHxg76q6B7gryQ6t+k3A0GrLWsBtbT/MXn1dXMKje3v6yyVJkqROGlhgqaqbq+roEaqOBD6aZCHQ/ySt44A1k9xA79atBYvpv6rq41V1x7CqecDcdrvUm+ntmSHJm4GHqupU4AjgeUle0urG2sMyfNyH2vy+C3xjqP8xzKO3iX54sBrNPwHvTLIKsDfwL+32tjmtDnqh6wp6AaV//IOBt7Zr33ic40mSJEkDkyrvEBqk9rtbdq2qNw16LiOZOXtOHTjv/EFPQ5KWu0O3mjHoKUjSSivJgqp63O9YhMeuYGgFS3IMsDPwikHPRZIkSeoiA8sAVdVBg56DJEmS1GWD3HQvSZIkSWMysEiSJEnqLAOLJEmSpM4ysEiSJEnqLAOLJEmSpM4ysEiSJEnqLB9rrDFtOG2qv0xNkiRJA+MKiyRJkqTOMrBIkiRJ6iwDiyRJkqTOMrBIkiRJ6iwDiyRJkqTOMrBIkiRJ6iwfa6wx3X7/wxyx8I5BT0OSloqPZZekyc8VFkmSJEmdZWCRJEmS1FkGFkmSJEmdZWCRJEmS1FkGFkmSJEmdZWCRJEmS1FkGFkmSJEmdZWCRJEmS1FkGFkmSJEmdtVIFliT3DmDMS8fZbnqSO5Oknb8wSSWZ2c7XSfKrJOP+b5bksCTvHqF8VpLrxtuPJEmSNCgrVWBZEkmmTEQ/VbXdONvdDdwGPKcVbQcsbH8CbAt8t6p+P57+kkxdwqlKkiRJnbPSBZYkOyY5r+/82CT7tOObknwsyVXA65Psl+TKJNck+WKSaa3d65Nc18q/3co2S/LdJFcnuTbJM1v5vX1jHZJkUXvdESNM71IeDSjbAUcNO7+k9TMnyeVtnC8lWbeVX5Tk35LMBw4edt3btHGvAd66TG+iJEmStIKsdIFlHO6sqq2r6nTg7Kp6XlU9F7gBeEtr8wHgL1v5q1vZAcAnq2oOMBe4ub/TJDsDuwIvaK87coSxL+HRgPJ04MzWF6186PayzwOHVNWWwCLgg319PKGq5lbVvw7r+7PAQW3sMSXZP8n8JPPvu+vOxTWXJEmSlhsDy+Od0Xe8eZKLkywC9gI2a+WXACcl2Q8YunXsMuAfkxwCbFpVDwzr92XAZ6vqfoCq+tUIY18KbJfkacBNVfVbIEnWBLYBrkiyDjC9qr7VXvM54MWjzB96HUxvr/l2Kzp5rDegqk5ooWfuGuuuN1ZTSZIkablaGQPLwzz2ulcbVn9f3/FJwIFVtQVw+FDbqjoAeB+wCbAgyXpVdSq91ZYHgK8kecmSTqyqfghMB3ahF4AAFgD70gsw43lowH2LbyJJkiRNDitjYPkJMDvJE9vKw0vHaLsWcFuSVemtsACQ5E+q6oqq+gDwS2CTJE8Hbqyqo4EvA1sO6+sbwL59+2CeNMqYl9PbfzIUWC4D3k7bv1JV9wB3Jdmh1b8J+NbwTvq1Df13J9m+Fe01VntJkiSpK1aaJ0m1p2Y9WFU/S/IF4Drgx/SexDWa9wNX0AslV9ALMAD/0jbVB7gAuAY4BHhTkoeA24GP9HdUVV9NMgeYn+R3wFeAfxxhzEuAVwDz2/ll9Paz9D8eeW/g+BZ+bqS3ArM4+wKfSVLA18fRXpIkSRq4VNWg57BCJHku8J9V9fxB
z2UymTl7Th047/xBT0OSlsqhW80Y9BQkSeOQZEFVzR2pbqW4JSzJAcBp9PadSJIkSZokVopbwqrqeOD4Qc9DkiRJ0pJZKVZYJEmSJE1OBhZJkiRJnWVgkSRJktRZBhZJkiRJnWVgkSRJktRZBhZJkiRJnbVSPNZYS2/DaVP9xWuSJEkaGFdYJEmSJHWWgUWSJElSZxlYJEmSJHWWgUWSJElSZxlYJEmSJHWWgUWSJElSZxlYJEmSJHWWgUWSJElSZxlYJEmSJHVWqmrQc1CHJfkN8INBz0OT2gzgjkFPQpOenyNNBD9HWlZ+hpafTatq/ZEqpq7omWjS+UFVzR30JDR5JZnvZ0jLys+RJoKfIy0rP0OD4S1hkiRJkjrLwCJJkiSpswwsWpwTBj0BTXp+hjQR/BxpIvg50rLyMzQAbrqXJEmS1FmusEiSJEnqLAOLJEmSpM4ysGhESV6e5AdJfpTk0EHPR92VZJMkFya5Psn3khzcyp+U5BtJftj+XLeVJ8nR7bN1bZKtB3sF6ookU5IsTHJeO39akivaZ+WMJE9o5U9s5z9q9bMGOW91R5LpSc5K8v0kNyR5od9FWhJJ3tH+X3ZdktOSrOZ30eAZWPQ4SaYAnwJ2BmYDeyaZPdhZqcMeBt5VVbOBbYG3ts/LocAFVfVM4IJ2Dr3P1TPbz/7AcSt+yuqog4Eb+s4/BhxVVc8A7gLe0srfAtzVyo9q7SSATwJfrapnA8+l93nyu0jjkmRj4G3A3KraHJgC7IHfRQNnYNFIng/8qKpurKrfAacDuw54Tuqoqrqtqq5qx7+h9xeEjel9Zj7Xmn0O2K0d7wp8vnouB6YnecoKnrY6JslM4JXAie08wEuAs1qT4Z+hoc/WWcBLW3utxJKsA7wY+DRAVf2uqu7G7yItmanA6kmmAtOA2/C7aOAMLBrJxsDP+s5vbmXSmNpy+FbAFcAGVXVbq7od2KAd+/nSSP4N+Afg9+18PeDuqnq4nfd/Tv7wGWr197T2Wrk9Dfgl8Nl2a+GJSdbA7yKNU1XdAnwc+Cm9oHIPsAC/iwbOwCJpQiRZE/gi8Paq+nV/XfWen+4z1DWiJK8CflFVCwY9F01qU4GtgeOqaivgPh69/Qvwu0hja/ubdkuzRzAAAAQlSURBVKUXfjcC1gBePtBJCTCwaGS3AJv0nc9sZdKIkqxKL6zMq6qzW/HPh26vaH/+opX7+dJwLwJeneQmeregvoTeXoTp7bYMeOzn5A+foVa/DnDnipywOulm4OaquqKdn0UvwPhdpPF6GfDjqvplVT0EnE3v+8nvogEzsGgkVwLPbE/FeAK9DWfnDnhO6qh2v+6ngRuq6hN9VecCe7fjvYEv95W/uT2hZ1vgnr7bNbQSqqr3VNXMqppF7/vmm1W1F3AhsHtrNvwzNPTZ2r2191/NV3JVdTvwsyTPakUvBa7H7yKN30+BbZNMa/9vG/oM+V00YP6me40oySvo3VM+BfhMVX14wFNSRyXZHrgYWMSj+w/+kd4+li8ATwV+AvxVVf2q/U/gWHrL7PcD+1bV/BU+cXVSkh2Bd1fVq5I8nd6Ky5OAhcBfV9WDSVYDTqa3X+pXwB5VdeOg5qzuSDKH3oMbngDcCOxL7x9n/S7SuCQ5HHgDvSdgLgT+ht5eFb+LBsjAIkmSJKmzvCVMkiRJUmcZWCRJkiR1loFFkiRJUmcZWCRJkiR1loFFkiRJUmcZWCRJK5UkjyS5Osl1Sc5MMm2C+98nybFL+Jq5SY5uxzsm2W4i5yRJk5mBRZK0snmgquZU1ebA74ADBjmZJFOran5Vva0V7QgYWCSpMbBIklZmFwPPAEjyzrbqcl2St7eyWUm+n2RekhuSnDW0IpPkpiQz2vHcJBcN7zzJLkmuSLIwyflJNmjlhyU5OcklwMltVeW8JLPoBah3tFWgHZL8OMmq7XVr959L0srAwCJJWiklmQrsDCxKsg2934r+AmBbYL8kW7WmzwL+vaqeA/wa
+PslGOY7wLZVtRW935T9D311s4GXVdWeQwVVdRNwPHBUWwW6GLgIeGVrsgdwdlU9tCTXKkmTmYFFkrSyWT3J1cB84KfAp4HtgS9V1X1VdS9wNrBDa/+zqrqkHZ/S2o7XTOBrSRYB/xfYrK/u3Kp6YBx9nEgvTNH+/OwSjC9Jk97UQU9AkqQV7IGqmtNfkGSs9jXK+cM8+g9/q43y2mOAT1TVuUl2BA7rq7tvPJOtqkvarWk7AlOq6rrxvE6S/li4wiJJUm8vy25JpiVZA3hNKwN4apIXtuM30rvNC+AmYJt2/LpR+l0HuKUd7z3OufwGWGtY2eeBU3F1RdJKyMAiSVrpVdVVwEnAd4ErgBOramGr/gHw1iQ3AOsCx7Xyw4FPJpkPPDJK14cBZyZZANwxzun8F/CaoU33rWxeG/u0cV+UJP2RSNXwlW5JkgS9p4QB57VHIA9yHrsDu1bVmwY5D0kaBPewSJLUYUmOofc0s1cMei6SNAiusEiSJEnqLPewSJIkSeosA4skSZKkzjKwSJIkSeosA4skSZKkzjKwSJIkSeqs/w814g4WQiJRSgAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 864x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "# Order every movie by its 'popularity' value, highest first, and chart the top six.\n",
    "pop = df.sort_values('popularity', ascending=False)\n",
    "fig, ax = plt.subplots(figsize=(12, 4))\n",
    "ax.barh(pop['title'].head(6), pop['popularity'].head(6), align='center', color='skyblue')\n",
    "# barh draws the first bar at the bottom; flip the axis so the most popular movie is on top.\n",
    "ax.invert_yaxis()\n",
    "ax.set_xlabel(\"Popularity\")\n",
    "ax.set_title(\"Popular Movies\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "_uuid": "a92da8cde39c61deef5a1b8efa31ed84cda7f5fe"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4803, 20978)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "# Missing overviews become empty strings so the vectorizer receives only text\n",
    "df['overview'] = df['overview'].fillna('')\n",
    "# TF-IDF vectorizer that ignores common English stop words such as 'the' and 'a'\n",
    "tfidf = TfidfVectorizer(stop_words='english')\n",
    "# Learn the vocabulary from the overviews and encode them as a (movies x terms) matrix\n",
    "tfidf_matrix = tfidf.fit_transform(df['overview'])\n",
    "# Show the matrix dimensions\n",
    "tfidf_matrix.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "_uuid": "5eb17d12220eecab4faf01bbfd13e79d8e446537"
   },
   "outputs": [],
   "source": [
    "# Import linear_kernel\n",
    "from sklearn.metrics.pairwise import linear_kernel\n",
    "# Pairwise dot products of the TF-IDF rows. TfidfVectorizer L2-normalises each row by\n",
    "# default, so the dot product is the cosine similarity.\n",
    "cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)\n",
    "# Reverse map: movie title -> row index in df.\n",
    "# Series.drop_duplicates() compares the *values* (the row indices, which are unique),\n",
    "# so it never removes a repeated title. De-duplicate on the index instead, keeping the\n",
    "# first movie for each title, so that indices[title] is always a single integer.\n",
    "indices = pd.Series(df.index, index=df['title'])\n",
    "indices = indices[~indices.index.duplicated(keep='first')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "_uuid": "9c383fcbb916dce464b01adf980d26ad96aebe0e"
   },
   "outputs": [],
   "source": [
    "# Function that takes in movie title as input and outputs most similar movies\n",
    "def get_recommendations(title, cosine_sim=cosine_sim):\n",
    "    \"\"\"Return the titles of the 10 movies most similar to `title`.\n",
    "\n",
    "    title: a movie title present in the `indices` lookup (KeyError otherwise).\n",
    "    cosine_sim: similarity matrix indexed by row index, one score per row of df.\n",
    "    Returns a Series of movie titles, most similar first.\n",
    "    \"\"\"\n",
    "    # Get the index of the movie that matches the title\n",
    "    idx = indices[title]\n",
    "    # A title shared by several movies yields a Series of indices; use the first one\n",
    "    if isinstance(idx, pd.Series):\n",
    "        idx = idx.iloc[0]\n",
    "    # Pairwise similarity scores against every other movie. The query movie is excluded\n",
    "    # by index rather than by assuming it sorts first: ties (for example an all-zero row\n",
    "    # for an empty overview) can leave it anywhere in the ordering.\n",
    "    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]\n",
    "    # Sort by similarity, highest first, and keep the 10 best\n",
    "    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:10]\n",
    "    # Get the movie indices\n",
    "    movie_indices = [i[0] for i in sim_scores]\n",
    "    # Return the top 10 most similar movies\n",
    "    return df['title'].iloc[movie_indices]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "_uuid": "59a8d0991e3cae9a44a4b351e154fd1000724448"
   },
   "outputs": [],
   "source": [
    "# Parse the stringified features into their corresponding python objects\n",
    "from ast import literal_eval\n",
    "features = ['cast', 'crew', 'keywords', 'genres']\n",
    "for feature in features:\n",
    "    df[feature] = df[feature].apply(literal_eval)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "_uuid": "783b0e89f1c04a12ff51eb29cc68e93c818896cd"
   },
   "outputs": [],
   "source": [
    "# Get the director's name from the crew feature. If director is not listed, return NaN\n",
    "def get_director(x):\n",
    "    for i in x:\n",
    "        if i['job'] == 'Director':\n",
    "            return i['name']\n",
    "    return np.nan\n",
    "\n",
    "# Returns the list top 3 elements or entire list; whichever is more.\n",
    "def get_list(x):\n",
    "    if isinstance(x, list):\n",
    "        names = [i['name'] for i in x]\n",
    "        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.\n",
    "        if len(names) > 3:\n",
    "            names = names[:3]\n",
    "        return names\n",
    "    #Return empty list in case of missing/malformed data\n",
    "    return []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "_uuid": "dd060c3c1d724de71555218f30cccafd4a8ad6af"
   },
   "outputs": [],
   "source": [
    "# Define new director, cast, genres and keywords features that are in a suitable form.\n",
    "df['director'] = df['crew'].apply(get_director)\n",
    "features = ['cast', 'keywords', 'genres']\n",
    "for feature in features:\n",
    "    df[feature] = df[feature].apply(get_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "_uuid": "86af764c406a8b6184b37b57cfe499d20ce45f9c"
   },
   "outputs": [],
   "source": [
    "# Function to convert all strings to lower case and strip names of spaces\n",
    "def clean_data(x):\n",
    "    if isinstance(x, list):\n",
    "        return [str.lower(i.replace(\" \", \"\")) for i in x]\n",
    "    else:\n",
    "        #Check if director exists. If not, return empty string\n",
    "        if isinstance(x, str):\n",
    "            return str.lower(x.replace(\" \", \"\"))\n",
    "        else:\n",
    "            return ''\n",
    "\n",
    "# Apply clean_data function to your features.\n",
    "features = ['cast', 'keywords', 'director', 'genres']\n",
    "\n",
    "for feature in features:\n",
    "    df[feature] = df[feature].apply(clean_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "_uuid": "20aef87703c408926f7617573ed043605207767f"
   },
   "outputs": [],
   "source": [
    "def create_soup(x):\n",
    "    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])\n",
    "df['soup'] = df.apply(create_soup, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "_uuid": "b66a1afc1083917d5ef136ccdcd9b50cca087e2b"
   },
   "outputs": [],
   "source": [
    "# Import CountVectorizer and create the count matrix\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "count = CountVectorizer(stop_words='english')\n",
    "count_matrix = count.fit_transform(df['soup'])\n",
    "# Compute the Cosine Similarity matrix based on the count_matrix\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "cosine_sim2 = cosine_similarity(count_matrix, count_matrix)\n",
    "# Reset index of our main DataFrame and construct reverse mapping as before\n",
    "df = df.reset_index()\n",
    "indices = pd.Series(df.index, index = df['title'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "_uuid": "d1e0e02be7a9e71422d3a492834cb4f8434d1464"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "65               The Dark Knight\n",
       "119                Batman Begins\n",
       "4638    Amidst the Devil's Wings\n",
       "1196                The Prestige\n",
       "3073           Romeo Is Bleeding\n",
       "3326              Black November\n",
       "1503                      Takers\n",
       "1986                      Faster\n",
       "303                     Catwoman\n",
       "747               Gangster Squad\n",
       "Name: title, dtype: object"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_recommendations('The Dark Knight Rises', cosine_sim2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "_uuid": "d6c4df85a80d830b2905f69e0e59ebb3461db3b7"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "867      The Godfather: Part III\n",
       "2731      The Godfather: Part II\n",
       "4638    Amidst the Devil's Wings\n",
       "2649           The Son of No One\n",
       "1525              Apocalypse Now\n",
       "1018             The Cotton Club\n",
       "1170     The Talented Mr. Ripley\n",
       "1209               The Rainmaker\n",
       "1394               Donnie Brasco\n",
       "1850                    Scarface\n",
       "Name: title, dtype: object"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_recommendations('The Godfather', cosine_sim2)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
